library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
path = "/Users/tina/Documents/School/Capstone/Dallas Files/"
#vacant_parcels <- read.csv(paste0(path, "CPAL/DCAD_parcels_vac.csv"), stringsAsFactors = F)
#abatement_exempt <- read.csv(paste0(path, "DCAD2019/abatement_exempt.csv"), stringsAsFactors = F)
account_apprl_year <- read.csv(paste0(path, "DCAD2020_CURRENT/account_apprl_year.csv"), nrows=50000, stringsAsFactors = F)
account_info <- read.csv(paste0(path, "DCAD2020_CURRENT/account_info.csv"), nrows=50000, stringsAsFactors = F)
#account_tif <- read.csv(paste0(path, "DCAD2019/account_tif_small.csv"), stringsAsFactors = F)
account_exempt_value <- read.csv(paste0(path, "DCAD2020_CURRENT/acct_exempt_value.csv"), nrows=10000, stringsAsFactors = F)
#account_exempt_value$ACCOUNT_NUM <- as.character(account_exempt_value$ACCOUNT_NUM)
#applied_std_exempt <- read.csv(paste0(path, "DCAD2019/applied_std_exempt_small.csv"), stringsAsFactors = F)
#com_detail <- read.csv(paste0(path, "DCAD2019/com_detail_small.csv"), stringsAsFactors = F)
#land <- read.csv(paste0(path, "DCAD2019/land_small.csv"), stringsAsFactors = F)
#land$ACCOUNT_NUM <- as.character(land$ACCOUNT_NUM)
#multi_owner <- read.csv(paste0(path, "DCAD2019/multi_owner_small.csv"), stringsAsFactors = F)
#res_addl <- read.csv(paste0(path, "DCAD2019/res_addl_small.csv"), stringsAsFactors = F)
#res_detail <- read.csv(paste0(path, "DCAD2019/res_detail_small.csv"), stringsAsFactors = F)
#taxable_object <- read.csv(paste0(path, "DCAD2019/taxable_object_small.csv"), stringsAsFactors = F)
#taxable_object$ACCOUNT_NUM <- as.character(taxable_object$ACCOUNT_NUM)
#taxable_object$TAX_OBJ_ID <- as.character(taxable_object$TAX_OBJ_ID)
# Join all the files together

# df = #vacant_parcels %>%
#   #left_join(account_apprl_year) %>%
#   account_apprl_year %>%
#   left_join(account_info) %>%
#   left_join(account_tif) %>%
#   left_join(account_exempt_value) %>%
#   left_join(applied_std_exempt) %>%
#   left_join(com_detail) %>%
#   left_join(land) %>%
#   left_join(multi_owner) %>%
#   left_join(res_addl) %>%
#   left_join(res_detail) %>%
#   left_join(taxable_object)
df = account_info %>%
  left_join(account_apprl_year) 
## Joining, by = c("ACCOUNT_NUM", "APPRAISAL_YR", "DIVISION_CD", "GIS_PARCEL_ID")
summary(df$IMPR_VAL)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0        0    71720   229377   180875 41386600    47341
df_gg = df %>%
  filter(!is.na(IMPR_VAL)) %>%
  filter(IMPR_VAL < 37794550) %>%
  group_by(GIS_PARCEL_ID) %>%
  summarise_all(max) %>% #deduplicate by parcel id and keep the max of all other columns
  mutate(lt_1k_impr_val = ifelse(IMPR_VAL < 1000, "yes", "no"))

ggplot(df_gg, aes(x=lt_1k_impr_val, y = ..count.., fill=lt_1k_impr_val)) +
           geom_bar() + 
          ggtitle("Count of Parcels <$1000 Improvement Value")

# bar graph of things less than 2k improvement

df = account_info %>%
  left_join(account_apprl_year) 
## Joining, by = c("ACCOUNT_NUM", "APPRAISAL_YR", "DIVISION_CD", "GIS_PARCEL_ID")
summary(factor(df$LMA))
## UNASSIGNED     2DSK10     2DSK13     1DSDE1     1DSJ10     2DSK09     2DSK11 
##      45342        317        309        283        280        191        174 
##     2DSK15     2DSK9A     1DSI10     2DSK14     1DSJ7G     1DSJ11     1DSJ7A 
##        149        139        125         90         85         82         78 
##     1DSOL1     1DSJ7F     2DSL10     1DSJ09     1DSI12     1DSJ7B     1DSOL7 
##         78         77         71         68         64         60         54 
##     1DSJ7E     2DSK16     1DSI09     1DSOL8     1DSOL6     1DSI11     2DSK08 
##         51         51         46         46         43         41         40 
##     1DSOL4     2DSL6B     2DSL7A     1DSOL2     1DSK07     1DSCEX     1DSOL3 
##         38         38         37         35         33         31         28 
##     1DSCE9     1DSH6B     1DSJ6A     2WS006     1OS005     1DSJ7D     1ISA08 
##         26         25         25         24         22         21         20 
##     1DSOL5     1DSK7A     2DSL09     2LS008     1DSI7A     1DSK6A     1DSJ06 
##         19         18         18         18         16         16         15 
##     2SS003     1DSH9C     2DSQ03     3DSI02     1DSK04     2DSL6A     2DSM07 
##         15         14         14         14         13         13         13 
##     3MSF01     1DSCE8     1DSH06     2DSCL1     2DSL07     2DSL12     2LS011 
##         13         12         12         12         12         12         12 
##     2PS001     3DSL01     1DSG04     1DSJ12     2DSQ04     2WS005     1DSJ5A 
##         12         12         11         11         11         11         10 
##     2DSK17     2DSN10     2DSR14     2PS002     3MSD05     1DSI05     3GSE02 
##         10         10         10         10         10          9          9 
##     3GSE03     3MSB02     3MSG02     1DSH16     1DSH9A     2DSM08     2DSP12 
##          9          9          9          8          8          8          8 
##     3DSI01     3DSM01     1DSJ7C     1DSK05     2DSK23     2DSL06     2DSN07 
##          8          8          7          7          7          7          7 
##     2DSN09     2LS007     2LS010     2SS002     2WS004     3DSJ03     3MSE01 
##          7          7          7          7          7          7          7 
##     3YSB05    (Other) 
##          7        550
summary(df$IMPR_VAL)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0        0    71720   229377   180875 41386600    47341
df_gg = df %>%
  filter(!is.na(LMA)) %>%
  filter(IMPR_VAL < 37794550) %>%
  mutate(lt_1k_impr_val = ifelse(IMPR_VAL < 1000, "yes", "no"))

ggplot(df_gg, aes(x=LMA, y = ..count.., fill=lt_1k_impr_val)) +
           geom_bar() + 
          ggtitle("Count of Parcels <$1000 Improvement Value by LMA")

df_gg %>% 
  count(lt_1k_impr_val = factor(lt_1k_impr_val)) %>% 
  mutate(pct = prop.table(n)) %>%
  ggplot(aes(x = lt_1k_impr_val, y = pct, fill = lt_1k_impr_val, label = paste(scales::percent(pct), "\n ( n=", n, ")"))) + 
  geom_col(position = 'dodge') + 
  geom_text(position = position_dodge(width = .9),    # move to center of bars
            vjust = .5,    # nudge above top of bar
            size = 3) + 
  scale_y_continuous(labels = scales::percent) + 
  ggtitle("Percent Parcels Less Than $1k Improvement Value")

summary(factor(df$NBHD_CD))
## UNASSIGNED     1DSA04     1DSA06     1DSD17     1DSA02     1DSD11     1DSG08 
##       4658       1308       1251        750        557        491        428 
##     1DSA07     1DSI01     1DS907     1DSJ22     1DSG06     1DSJ03     1DSM15 
##        396        377        349        289        276        247        207 
##     4DSG70     1DSD08     4DSE13     1DSJ19     1DSD14     1DSD04     4DSG15 
##        203        196        190        174        168        167        159 
##     1DSM26     1DSJ11     1DSJ12     1DSA05     1DSG11     1DSK01     4DSD39 
##        154        148        148        146        142        141        141 
##     4DSG85     4DSD89     1DSB01     4DSE18     1DSM17     1DSD12     1DSQ01 
##        139        136        133        130        129        128        127 
##     1DSJ21     1DSJ06     1DSJ08     5DSD06     1DSM12     4DSE27     1DSD16 
##        125        123        123        122        121        116        108 
##     1DSG09     4DSG44     4DSG40     1DS909     1DSJ24     4DSG45     1DSJ1H 
##        108        108        107        106        106        104        101 
##     1DSM21     1DSA01     1DSD03     4DSG11     4DSD79     4DSD13     1DSA10 
##        101         99         99         98         96         95         94 
##     4DSA14     4DSG08     1DSM19     5DSG05     5DSF01     1DSA09     2RSI01 
##         90         90         88         87         86         84         84 
##     4DSD42     4DSD44     4DSF10     1DSJ20     4DSF11     4DSI01     4DSM02 
##         84         80         78         77         76         76         76 
##     1DSM11     4DSD40     4DSE31     1DSF01     4DSD88     5DSJ13     1DSM04 
##         75         74         73         72         72         72         71 
##     4DSE29     4DSI27     4WSI28     4DSD81     1DSM13     4DSD11     4DSD36 
##         70         70         70         69         67         67         67 
##     1DSF03     1DSJ18     1DSM20     4DSG72     4DSH16     5IS108     4DSG47 
##         65         65         65         65         64         64         63 
##     4WSD06     1DSJ27     3MSG21     5DS311     3DSJ13     3GSH07     4DSD38 
##         63         62         62         62         61         61         61 
##     4DSD86    (Other) 
##         61      29868
summary(df$IMPR_VAL)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0        0    71720   229377   180875 41386600    47341
df_gg = df %>%
  filter(!is.na(NBHD_CD)) %>%
  filter(IMPR_VAL < 37794550) %>%
  mutate(lt_1k_impr_val = ifelse(IMPR_VAL < 1000, "yes", "no"))

ggplot(df_gg, aes(x=NBHD_CD, y = ..count.., fill=lt_1k_impr_val)) +
           geom_bar() + 
          theme(axis.text.x = element_text(angle = 90)) +
          ggtitle("Count of Parcels <$1000 Improvement Value by NBHD_CD")

# account_apprl_year X account_exempt
# contains auxiliary data for each appraisal account in Account_Apprl_Year.

df =  account_exempt_value %>%
  left_join(account_apprl_year) 
## Joining, by = c("ACCOUNT_NUM", "APPRAISAL_YR")
summary(factor(df$CITY_APPLD_VAL))
##   1e+05   12000       0    5000   10000    7500   50000     100   56968  135000 
##     780     767     288     226     151     121      24       9       7       7 
##  160000  187500  312500  425000  525000   66000   99000  271960   11806   51120 
##       7       7       7       7       7       6       6       6       5       5 
##   71088  375000      60    8550   39102   47112   60000   81000  130000  140000 
##       5       5       4       4       4       4       4       4       4       4 
##  145200  150000  154000  166250  180000  325000    6408   10025   10245   12056 
##       4       4       4       4       4       4       3       3       3       3 
##   14412   14642   15157   17000   18464   19074   19296   24134   25313   26372 
##       3       3       3       3       3       3       3       3       3       3 
##   31092   37000   45144   54000   60912   67000   70000   74000   75000   76384 
##       3       3       3       3       3       3       3       3       3       3 
##   78000   78896   80000   91140   95000   96000   98000  101000  104000  105000 
##       3       3       3       3       3       3       3       3       3       3 
##  105024  120000  125000  134000  141000  158000  168000  170000  185000  188000 
##       3       3       3       3       3       3       3       3       3       3 
##  198000   2e+05  205000  208000  220000  224000  240000  246000   3e+05  340000 
##       3       3       3       3       3       3       3       3       3       3 
##  342500  382500    1700    5045    5072    5428    5870    7744    7846 (Other) 
##       3       3       2       2       2       2       2       2       2    7309
summary(df$IMPR_VAL)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0    32975   189440   770077   379400 72251070     9450
df_gg = df %>%
  #filter(!is.na(CITY_APPLD_VAL)) %>%
  filter(IMPR_VAL < 37794550)

#  select(CITY_APPLD_VAL, CNTY_APPLD_VAL, ISD_APPLD_VAL, HOSPITAL_APPLD_VAL, COLLEGE_APPLD_VAL, SPCL_APPLIED_VAL)
summary(factor(df_gg$CITY_APPLD_VAL))
##   1e+05   12000   10000    5000    7500       0     100      60  271960  425000 
##      57      36       9       8       7       5       5       3       3       3 
##   71088  134000  325000      30    5651    7400    9844   10037   17000   17682 
##       2       2       2       1       1       1       1       1       1       1 
##   18424   19013   20000   20248   21601   24108   25293   26060   26174   26300 
##       1       1       1       1       1       1       1       1       1       1 
##   27572   27830   28353   28898   28915   29240   30874   30988   31438   31612 
##       1       1       1       1       1       1       1       1       1       1 
##   32286   33038   33166   33656   35609   35710   36050   36131   37000   37618 
##       1       1       1       1       1       1       1       1       1       1 
##   38720   40675   41050   41062   43222   43372   43622   43992   45112   45174 
##       1       1       1       1       1       1       1       1       1       1 
##   45468   48291   50000   50744   50751   51323   52840   56400   57665   59172 
##       1       1       1       1       1       1       1       1       1       1 
##   59382   59400   59512   61366   61560   61954   65068   65940   66088   66440 
##       1       1       1       1       1       1       1       1       1       1 
##   67930   68594   69672   70272   71837   71846   72356   72363   73284   73792 
##       1       1       1       1       1       1       1       1       1       1 
##   74147   74814   75030   75086   75532   75738   75826   75962   76000 (Other) 
##       1       1       1       1       1       1       1       1       1     318
ggplot(df_gg, aes(x=CITY_APPLD_VAL, y = IMPR_VAL, color=CITY_APPLD_VAL, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(CITY_APPLD_VAL)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=CITY_APPLD_VAL, y = IMPR_VAL, color=CITY_APPLD_VAL, alpha=.5)) +
          geom_point()

# account_info
# contains auxiliary data for each appraisal account in Account_Apprl_Year.

df = account_apprl_year %>%
  left_join(account_info) 
## Joining, by = c("ACCOUNT_NUM", "APPRAISAL_YR", "GIS_PARCEL_ID", "DIVISION_CD")
summary(factor(df$OWNER_CITY))
##           DALLAS           IRVING          GARLAND            PLANO 
##             2055               67               54               36 
##         MESQUITE       RICHARDSON          HOUSTON       CARROLLTON 
##               31               24               20               19 
##       CEDAR HILL   FARMERS BRANCH        ARLINGTON          COPPELL 
##               17               17               16               15 
##           DESOTO           FRISCO           AUSTIN        LANCASTER 
##               15               15               13               13 
##          ADDISON       LEWISVILLE      DUNCANVILLE          ROWLETT 
##               12               12               10               10 
##    GRAND PRAIRIE           ARGYLE       FORT WORTH         MCKINNEY 
##                9                7                7                7 
##           MURPHY           SACHSE        SOUTHLAKE            ALLEN 
##                6                6                6                5 
##     FLOWER MOUND        SAN DIEGO       SUGAR LAND         ATHERTON 
##                5                5                4                3 
##          ATLANTA           FORNEY           LUFKIN         ROCKWALL 
##                3                3                3                3 
##        SUNNYVALE    WICHITA FALLS            WYLIE                  
##                3                3                3                2 
##    BALCH SPRINGS          CORINTH           EULESS         FT WORTH 
##                2                2                2                2 
##             KATY        LOS ALTOS      LOS ANGELES          MIDLAND 
##                2                2                2                2 
##         NEW YORK    SAN FRANCISCO           SPRING       THE COLONY 
##                2                2                2                2 
##      TROPHY CLUB         WHITTIER            ALAMO          ARCADIA 
##                2                2                1                1 
##         BELLAIRE         BROOKLYN         BURLESON       BURLINGAME 
##                1                1                1                1 
##         CARLSBAD            CLUTE  COLLEGE STATION      COLLEYVILLE 
##                1                1                1                1 
##         COLUMBIA          COMBINE       COSTA MESA           DENTON 
##                1                1                1                1 
##           EUCLID             FATE            FLINT           FOLSOM 
##                1                1                1                1 
##      GAINESVILLE         GLENDALE HIGHLAND VILLAGE            HURST 
##                1                1                1                1 
##          KAUFMAN           KELLER            KEMAH         KINGWOOD 
##                1                1                1                1 
##      LAKE DALLAS          LINDSAY       LITTLE ELM         LOCKHART 
##                1                1                1                1 
##         MABLETON    MANAKIN SABOT         MARSHALL       MIDLOTHIAN 
##                1                1                1                1 
##    MISSOURI CITY    NEWPORT BEACH          NORFOLK           OVILLA 
##                1                1                1                1 
##           PALMER        PALO ALTO           PARKER      PLACERVILLE 
##                1                1                1                1 
##          PROSPER          QUITMAN          (Other)             NA's 
##                1                1               23            47341
df_gg = df %>%
  filter(!is.na(OWNER_CITY)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$OWNER_CITY))
##           DALLAS           IRVING          GARLAND            PLANO 
##             2055               67               54               36 
##         MESQUITE       RICHARDSON          HOUSTON       CARROLLTON 
##               31               24               20               19 
##       CEDAR HILL   FARMERS BRANCH        ARLINGTON          COPPELL 
##               17               17               16               15 
##           DESOTO           FRISCO           AUSTIN        LANCASTER 
##               15               15               13               13 
##          ADDISON       LEWISVILLE      DUNCANVILLE          ROWLETT 
##               12               12               10               10 
##    GRAND PRAIRIE           ARGYLE       FORT WORTH         MCKINNEY 
##                9                7                7                7 
##           MURPHY           SACHSE        SOUTHLAKE            ALLEN 
##                6                6                6                5 
##     FLOWER MOUND        SAN DIEGO       SUGAR LAND         ATHERTON 
##                5                5                4                3 
##          ATLANTA           FORNEY           LUFKIN         ROCKWALL 
##                3                3                3                3 
##        SUNNYVALE    WICHITA FALLS            WYLIE                  
##                3                3                3                2 
##    BALCH SPRINGS          CORINTH           EULESS         FT WORTH 
##                2                2                2                2 
##             KATY        LOS ALTOS      LOS ANGELES          MIDLAND 
##                2                2                2                2 
##         NEW YORK    SAN FRANCISCO           SPRING       THE COLONY 
##                2                2                2                2 
##      TROPHY CLUB         WHITTIER            ALAMO          ARCADIA 
##                2                2                1                1 
##         BELLAIRE         BROOKLYN         BURLESON       BURLINGAME 
##                1                1                1                1 
##         CARLSBAD            CLUTE  COLLEGE STATION      COLLEYVILLE 
##                1                1                1                1 
##         COLUMBIA          COMBINE       COSTA MESA           DENTON 
##                1                1                1                1 
##           EUCLID             FATE            FLINT           FOLSOM 
##                1                1                1                1 
##      GAINESVILLE         GLENDALE HIGHLAND VILLAGE            HURST 
##                1                1                1                1 
##          KAUFMAN           KELLER            KEMAH         KINGWOOD 
##                1                1                1                1 
##      LAKE DALLAS          LINDSAY       LITTLE ELM         LOCKHART 
##                1                1                1                1 
##         MABLETON    MANAKIN SABOT         MARSHALL       MIDLOTHIAN 
##                1                1                1                1 
##    MISSOURI CITY    NEWPORT BEACH          NORFOLK           OVILLA 
##                1                1                1                1 
##           PALMER        PALO ALTO           PARKER      PLACERVILLE 
##                1                1                1                1 
##          PROSPER          QUITMAN          (Other)             NA's 
##                1                1               23            47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=OWNER_CITY, alpha=.5)) +
          geom_point()

head(df)
summary(df)
##  ACCOUNT_NUM         APPRAISAL_YR     IMPR_VAL            LAND_VAL        
##  Length:50000       Min.   :2020   Min.   :        0   Min.   :        0  
##  Class :character   1st Qu.:2020   1st Qu.:    40180   1st Qu.:    21988  
##  Mode  :character   Median :2020   Median :   132605   Median :    51355  
##                     Mean   :2020   Mean   :   855632   Mean   :   337330  
##                     3rd Qu.:2020   3rd Qu.:   321300   3rd Qu.:   232748  
##                     Max.   :2020   Max.   :897418440   Max.   :135493880  
##                                                                           
##  LAND_AG_EXEMPT       AG_USE_VAL          TOT_VAL          HMSTD_CAP_VAL      
##  Min.   :       0   Min.   :    0.00   Min.   :       10   Min.   :       10  
##  1st Qu.:       0   1st Qu.:    0.00   1st Qu.:   101640   1st Qu.:    97700  
##  Median :       0   Median :    0.00   Median :   225395   Median :   221030  
##  Mean   :    4326   Mean   :   21.22   Mean   :  1192962   Mean   :  1189414  
##  3rd Qu.:       0   3rd Qu.:    0.00   3rd Qu.:   590000   3rd Qu.:   585550  
##  Max.   :25653360   Max.   :68944.00   Max.   :897418440   Max.   :897418440  
##  NA's   :6602       NA's   :6602                                              
##     REVAL_YR    PREV_REVAL_YR   PREV_MKT_VAL       TOT_CONTRIB_AMT   
##  Min.   :2018   Min.   :1000   Min.   :       10   Min.   :       0  
##  1st Qu.:2020   1st Qu.:2018   1st Qu.:    89320   1st Qu.:       0  
##  Median :2020   Median :2019   Median :   212060   Median :       0  
##  Mean   :2020   Mean   :2012   Mean   :  1116204   Mean   :   19055  
##  3rd Qu.:2020   3rd Qu.:2019   3rd Qu.:   565405   3rd Qu.:       0  
##  Max.   :2021   Max.   :2021   Max.   :835104710   Max.   :74280400  
##                                NA's   :629                           
##  TAXPAYER_REP       CITY_JURIS_DESC    COUNTY_JURIS_DESC  ISD_JURIS_DESC    
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  HOSPITAL_JURIS_DESC COLLEGE_JURIS_DESC SPECIAL_DIST_JURIS_DESC
##  Length:50000        Length:50000       Length:50000           
##  Class :character    Class :character   Class :character       
##  Mode  :character    Mode  :character   Mode  :character       
##                                                                
##                                                                
##                                                                
##                                                                
##  CITY_SPLIT_PCT      COUNTY_SPLIT_PCT    ISD_SPLIT_PCT      
##  Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
##  1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
##  Median :0.0000000   Median :0.0000000   Median :0.0000000  
##  Mean   :0.0005902   Mean   :0.0006672   Mean   :0.0005902  
##  3rd Qu.:0.0000000   3rd Qu.:0.0000000   3rd Qu.:0.0000000  
##  Max.   :1.0000000   Max.   :1.0000000   Max.   :1.0000000  
##                                                             
##  HOSPITAL_SPLIT_PCT  COLLEGE_SPLIT_PCT   SPECIAL_DIST_SPLIT_PCT
##  Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00      
##  1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00      
##  Median :0.0000000   Median :0.0000000   Median :0.00e+00      
##  Mean   :0.0005902   Mean   :0.0006102   Mean   :4.64e-05      
##  3rd Qu.:0.0000000   3rd Qu.:0.0000000   3rd Qu.:0.00e+00      
##  Max.   :1.0000000   Max.   :1.0000000   Max.   :1.00e+00      
##                                                                
##  CITY_TAXABLE_VAL    COUNTY_TAXABLE_VAL  ISD_TAXABLE_VAL    
##  Min.   :        0   Min.   :        0   Min.   :        0  
##  1st Qu.:    81853   1st Qu.:    78988   1st Qu.:    81440  
##  Median :   195040   Median :   191225   Median :   198544  
##  Mean   :  1036565   Mean   :  1016846   Mean   :  1034477  
##  3rd Qu.:   494028   3rd Qu.:   486730   3rd Qu.:   515000  
##  Max.   :897418440   Max.   :540000000   Max.   :540000000  
##                                                             
##  HOSPITAL_TAXABLE_VAL COLLEGE_TAXABLE_VAL SPECIAL_DIST_TAXABLE_VAL
##  Min.   :        0    Min.   :        0   Min.   :       0        
##  1st Qu.:    78988    1st Qu.:    78710   1st Qu.:       0        
##  Median :   191235    Median :   191016   Median :       0        
##  Mean   :  1018742    Mean   :  1039284   Mean   :   41197        
##  3rd Qu.:   486934    3rd Qu.:   487155   3rd Qu.:       0        
##  Max.   :540000000    Max.   :540000000   Max.   :57850000        
##                                                                   
##  CITY_CEILING_VALUE COUNTY_CEILING_VALUE ISD_CEILING_VALUE 
##  Min.   :   0       Min.   :    0.0      Min.   :     0.0  
##  1st Qu.:   0       1st Qu.:  105.5      1st Qu.:   838.1  
##  Median :   0       Median :  373.2      Median :  2541.3  
##  Mean   :  29       Mean   :  842.6      Mean   :  4944.3  
##  3rd Qu.:   0       3rd Qu.:  856.6      3rd Qu.:  5221.5  
##  Max.   :4421       Max.   :58832.5      Max.   :273345.8  
##  NA's   :46106      NA's   :46106        NA's   :46106     
##  HOSPITAL_CEILING_VALUE COLLEGE_CEILING_VALUE SPECIAL_DIST_CEILING_VALUE
##  Mode:logical           Min.   :0             Mode:logical              
##  NA's:50000             1st Qu.:0             NA's:50000                
##                         Median :0                                       
##                         Mean   :0                                       
##                         3rd Qu.:0                                       
##                         Max.   :0                                       
##                         NA's   :46106                                   
##    VID_IND          GIS_PARCEL_ID      APPRAISAL_METH_CD RENDITION_PENALTY 
##  Length:50000       Length:50000       Min.   :1.000     Length:50000      
##  Class :character   Class :character   1st Qu.:1.000     Class :character  
##  Mode  :character   Mode  :character   Median :1.000     Mode  :character  
##                                        Mean   :1.392                       
##                                        3rd Qu.:1.000                       
##                                        Max.   :3.000                       
##                                                                            
##  DIVISION_CD        EXTRNL_CNTY_ACCT   EXTRNL_CITY_ACCT   P_BUS_TYP_CD      
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BLDG_CLASS_CD       SPTD_CODE           BIZ_NAME         OWNER_NAME1       
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  OWNER_NAME2    EXCLUDE_OWNER      OWNER_ADDRESS_LINE1 OWNER_ADDRESS_LINE2
##  Mode:logical   Length:50000       Length:50000        Length:50000       
##  NA's:50000     Class :character   Class :character    Class :character   
##                 Mode  :character   Mode  :character    Mode  :character   
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##  OWNER_ADDRESS_LINE3 OWNER_ADDRESS_LINE4  OWNER_CITY        OWNER_STATE       
##  Mode:logical        Mode:logical        Length:50000       Length:50000      
##  NA's:50000          NA's:50000          Class :character   Class :character  
##                                          Mode  :character   Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  OWNER_ZIPCODE       OWNER_COUNTRY        STREET_NUM    STREET_HALF_NUM
##  Min.   :    75152   Length:50000       Min.   :    1   Mode:logical   
##  1st Qu.:752024038   Class :character   1st Qu.: 1654   NA's:50000     
##  Median :752143759   Mode  :character   Median : 2934                  
##  Mean   :751503788                      Mean   : 3492                  
##  3rd Qu.:752271302                      3rd Qu.: 4900                  
##  Max.   :956677816                      Max.   :17328                  
##  NA's   :47343                          NA's   :47341                  
##  FULL_STREET_NAME     BLDG_ID            UNIT_ID          PROPERTY_CITY     
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PROPERTY_ZIPCODE       MAPSCO            NBHD_CD             LEGAL1         
##  Min.   :    75006   Length:50000       Length:50000       Length:50000      
##  1st Qu.:752015508   Class :character   Class :character   Class :character  
##  Median :752142319   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :653610548                                                           
##  3rd Qu.:752195409                                                           
##  Max.   :752548544                                                           
##  NA's   :47341                                                               
##     LEGAL2             LEGAL3             LEGAL4             LEGAL5         
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  DEED_TXFR_DATE     PHONE_NUM          LMA                IMA           
##  Length:50000       Mode:logical   Length:50000       Length:50000      
##  Class :character   NA's:50000     Class :character   Class :character  
##  Mode  :character                  Mode  :character   Mode  :character  
##                                                                         
##                                                                         
##                                                                         
## 
colSums(is.na(df))
##                ACCOUNT_NUM               APPRAISAL_YR 
##                          0                          0 
##                   IMPR_VAL                   LAND_VAL 
##                          0                          0 
##             LAND_AG_EXEMPT                 AG_USE_VAL 
##                       6602                       6602 
##                    TOT_VAL              HMSTD_CAP_VAL 
##                          0                          0 
##                   REVAL_YR              PREV_REVAL_YR 
##                          0                          0 
##               PREV_MKT_VAL            TOT_CONTRIB_AMT 
##                        629                          0 
##               TAXPAYER_REP            CITY_JURIS_DESC 
##                          0                          0 
##          COUNTY_JURIS_DESC             ISD_JURIS_DESC 
##                          0                          0 
##        HOSPITAL_JURIS_DESC         COLLEGE_JURIS_DESC 
##                          0                          0 
##    SPECIAL_DIST_JURIS_DESC             CITY_SPLIT_PCT 
##                          0                          0 
##           COUNTY_SPLIT_PCT              ISD_SPLIT_PCT 
##                          0                          0 
##         HOSPITAL_SPLIT_PCT          COLLEGE_SPLIT_PCT 
##                          0                          0 
##     SPECIAL_DIST_SPLIT_PCT           CITY_TAXABLE_VAL 
##                          0                          0 
##         COUNTY_TAXABLE_VAL            ISD_TAXABLE_VAL 
##                          0                          0 
##       HOSPITAL_TAXABLE_VAL        COLLEGE_TAXABLE_VAL 
##                          0                          0 
##   SPECIAL_DIST_TAXABLE_VAL         CITY_CEILING_VALUE 
##                          0                      46106 
##       COUNTY_CEILING_VALUE          ISD_CEILING_VALUE 
##                      46106                      46106 
##     HOSPITAL_CEILING_VALUE      COLLEGE_CEILING_VALUE 
##                      50000                      46106 
## SPECIAL_DIST_CEILING_VALUE                    VID_IND 
##                      50000                          0 
##              GIS_PARCEL_ID          APPRAISAL_METH_CD 
##                          0                          0 
##          RENDITION_PENALTY                DIVISION_CD 
##                          0                          0 
##           EXTRNL_CNTY_ACCT           EXTRNL_CITY_ACCT 
##                          0                          0 
##               P_BUS_TYP_CD              BLDG_CLASS_CD 
##                          0                          0 
##                  SPTD_CODE                   BIZ_NAME 
##                          0                      47341 
##                OWNER_NAME1                OWNER_NAME2 
##                      47341                      50000 
##              EXCLUDE_OWNER        OWNER_ADDRESS_LINE1 
##                      47341                      47341 
##        OWNER_ADDRESS_LINE2        OWNER_ADDRESS_LINE3 
##                      47341                      50000 
##        OWNER_ADDRESS_LINE4                 OWNER_CITY 
##                      50000                      47341 
##                OWNER_STATE              OWNER_ZIPCODE 
##                      47341                      47343 
##              OWNER_COUNTRY                 STREET_NUM 
##                      47341                      47341 
##            STREET_HALF_NUM           FULL_STREET_NAME 
##                      50000                      47341 
##                    BLDG_ID                    UNIT_ID 
##                      47341                      47341 
##              PROPERTY_CITY           PROPERTY_ZIPCODE 
##                      47341                      47341 
##                     MAPSCO                    NBHD_CD 
##                      47341                      47341 
##                     LEGAL1                     LEGAL2 
##                      47341                      47341 
##                     LEGAL3                     LEGAL4 
##                      47341                      47341 
##                     LEGAL5             DEED_TXFR_DATE 
##                      47341                      47341 
##                  PHONE_NUM                        LMA 
##                      50000                      47341 
##                        IMA 
##                      47341

There aren’t many cities listed. We only want dallas anyways. But there are tons of NA.

df_gg = df %>%
  filter(!is.na(PROPERTY_CITY)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$PROPERTY_CITY))
##                   ADDISON             BALCH SPRINGS    CARROLLTON (DALLAS CO) 
##                         4                        10                        27 
##                CEDAR HILL             COCKRELL HILL                   COMBINE 
##                        21                         1                         1 
##       COPPELL (DALLAS CO)                    DALLAS                    DESOTO 
##                        14                      2363                        13 
##               DUNCANVILLE            FARMERS BRANCH       GARLAND (DALLAS CO) 
##                        10                         9                        42 
## GRAND PRAIRIE (DALLAS CO)             HIGHLAND PARK                    IRVING 
##                         6                         5                        62 
##                 LANCASTER      MESQUITE (DALLAS CO)                   NO TOWN 
##                         7                        33                         1 
##                RICHARDSON       ROWLETT (DALLAS CO)                    SACHSE 
##                         8                         9                         4 
##                 SUNNYVALE           UNIVERSITY PARK                    WILMER 
##                         2                         6                         1 
##                      NA's 
##                     47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=PROPERTY_CITY, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(PROPERTY_CITY)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=PROPERTY_CITY, alpha=.5)) +
          geom_point()

There aren’t many zipcodes for us to see

df_gg = df %>%
  filter(!is.na(PROPERTY_ZIPCODE)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$PROPERTY_ZIPCODE))
##     75201     75215     75202     75226     75204     75223     75246 752203452 
##        68        67        45        35        25        20        13         9 
##     75207     75208     75061 752095808 752096255 752261204     75212     75219 
##         8         8         7         7         6         6         5         5 
## 750061433 752102026 752155125 752195409 752261425 752261512     75203     75234 
##         5         5         5         5         5         5         4         4 
## 752014702 752055521 752095926 752095927 752096214 752122333 752154640 752154948 
##         4         4         4         4         4         4         4         4 
## 752155010 752155014 752155043 752155118 752155144 752155262 752194437 752233551 
##         4         4         4         4         4         4         4         4 
## 752461062     75006     75214 752011254 752013419 752014316 752015508 752015509 
##         4         3         3         3         3         3         3         3 
## 752015816 752021927 752023335 752043920 752045537 752045541 752045543 752055418 
##         3         3         3         3         3         3         3         3 
## 752082120 752083955 752093616 752093624 752095913 752095924 752143748 752143801 
##         3         3         3         3         3         3         3         3 
## 752143817 752143840 752144011 752144013 752144556 752151150 752151244 752151324 
##         3         3         3         3         3         3         3         3 
## 752154617 752154641 752154939 752154942 752154955 752154958 752155030 752155036 
##         3         3         3         3         3         3         3         3 
## 752155037 752155119 752155129 752155152 752155338 752155352 752155357 752193816 
##         3         3         3         3         3         3         3         3 
## 752231163 752231914 752233503 752261215 752261505 752261510 752261608 752261627 
##         3         3         3         3         3         3         3         3 
## 752261719 752262106   (Other)      NA's 
##         3         3      2048     47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=PROPERTY_ZIPCODE, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(PROPERTY_ZIPCODE)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=PROPERTY_ZIPCODE, alpha=.5)) +
          geom_point()

Res have lower value than commercial

df_gg = df %>%
  filter(!is.na(DIVISION_CD)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$DIVISION_CD))
##   BPP   COM   RES 
##  6410 13233 30357
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=DIVISION_CD, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(DIVISION_CD)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=DIVISION_CD, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(MAPSCO)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$MAPSCO))
## 56-G (DALLAS) 56-C (DALLAS) 45-L (DALLAS) 46-F (DALLAS) 56-D (DALLAS) 
##            80            72            63            63            62 
## 46-J (DALLAS) 45-M (DALLAS) 45-Q (DALLAS) 37-W (DALLAS) 45-U (DALLAS) 
##            58            55            54            48            47 
## 45-H (DALLAS) 47-J (DALLAS) 34-M (DALLAS) 45-K (DALLAS) 37-S (DALLAS) 
##            42            42            40            35            34 
## 45-P (DALLAS) 46-K (DALLAS) 45-F (DALLAS) 34-R (DALLAS) 45-D (DALLAS) 
##            34            34            31            28            27 
## 45-V (DALLAS) 46-B (DALLAS) 34-G (DALLAS) 35-T (DALLAS) 35-S (DALLAS) 
##            26            25            24            24            22 
## 46-A (DALLAS) 46-N (DALLAS) 46-L (DALLAS) 45-R (DALLAS) 46-C (DALLAS) 
##            22            21            20            18            18 
## 36-L (DALLAS) 36-V (DALLAS) 36-M (DALLAS) 46-E (DALLAS) 34-L (DALLAS) 
##            17            17            16            16            15 
## 35-R (DALLAS) 35-X (DALLAS) 36-P (DALLAS) 36-Q (DALLAS) 45-B (DALLAS) 
##            15            15            15            15            15 
## 36-N (DALLAS) 35-W (DALLAS) 36-K (DALLAS) 45-N (DALLAS) 46-R (DALLAS) 
##            14            13            13            13            13 
## 47-N (DALLAS) 55-G (DALLAS) 36-S (DALLAS) 36-Z (DALLAS) 45-C (DALLAS) 
##            13            13            12            12            12 
## 46-V (DALLAS) 47-A (DALLAS) 56-B (DALLAS) 36-H (DALLAS) 36-T (DALLAS) 
##            12            12            12            11            11 
## 36-W (DALLAS) 36-X (DALLAS) 43-K (DALLAS) 44-X (DALLAS) 45-G (DALLAS) 
##            11            11            11            11            11 
## 46-X (DALLAS) 35-V (DALLAS) 43-Z (DALLAS) 45-E (DALLAS) 46-H (DALLAS) 
##            11            10            10            10            10 
## 54-B (DALLAS) 54-G (DALLAS) 36-J (DALLAS) 36-U (DALLAS) 44-T (DALLAS) 
##            10            10             9             9             9 
## 46-M (DALLAS) 46-T (DALLAS) 46-Y (DALLAS) 54-C (DALLAS) 58-B (DALLAS) 
##             9             9             9             9             9 
##  2-K (DALLAS) 46-G (DALLAS) 47-E (DALLAS) 48-V (DALLAS) 54-N (DALLAS) 
##             8             8             8             8             8 
## 23-U (DALLAS) 33-M (DALLAS) 34-V (DALLAS) 44-Z (DALLAS) 45-J (DALLAS) 
##             7             7             7             7             7 
## 54-F (DALLAS) 34-C (DALLAS) 36-F (DALLAS) 37-T (DALLAS) 37-X (DALLAS) 
##             7             6             6             6             6 
## 44-P (DALLAS) 44-V (DALLAS) 44-Y (DALLAS) 45-Z (DALLAS) 46-S (DALLAS) 
##             6             6             6             6             6 
## 46-U (DALLAS) 54-S (DALLAS) 55-J (DALLAS)       (Other)          NA's 
##             6             6             6           796         47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=MAPSCO, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(MAPSCO)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=MAPSCO, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(NBHD_CD)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$NBHD_CD))
## UNASSIGNED     1DSA07     1DSA01     5DSF01     1DSD12     1DSM12     5DSS06 
##        830        175         35         33         32         32         32 
##     1DSQ01     1DSA04     1DSK01     1DSA02     1DSJ19     1DSA06     1DSM11 
##         31         30         30         27         26         23         22 
##     1DSJ11     1DSD03     4DSA8C     1DSJ03     1DSM15     1DSJ1H     1DSD11 
##         20         15         15         13         13         12         11 
##     1DSG11     4DSA03     1DSJ21     1DSJ22     1DSM13     1DSM17     1DSM19 
##         11         11         10         10         10         10         10 
##     1DSM21     1DSS04     4DSI04     1DSD04     1DSF01     1DSG08     4DSE13 
##         10         10         10          9          9          9          9 
##     5DS101     1DSJ06     1DSP10     4DSD44     4DSG15     5DS315     5DSL10 
##          9          8          8          8          8          8          8 
##     1DSD16     1DSJ12     4DSG08     4DSG49     4DSM05     4WSI26     1DSD17 
##          7          7          7          7          7          7          6 
##     1DSG06     1DSJ07     1DSJ18     1DSJ24     1DSM09     4DSE11     4DSM06 
##          6          6          6          6          6          6          6 
##     4DSP01     5DS299     1DS932     1DSB01     1DSJ13     1DSM20     1DSM26 
##          6          6          5          5          5          5          5 
##     1DSP09     1DSS06     4DSG03     4DSG11     4DSG40     4DSG47     4DSG70 
##          5          5          5          5          5          5          5 
##     4DSH16     4DSK02     4DSL02     4DSP02     5AS310     5DS120     5DSG03 
##          5          5          5          5          5          5          5 
##     5DSG04     5DSM08     1DS483     1DS907     1DSG09     1DSM05     1DSP02 
##          5          5          4          4          4          4          4 
##     1DSV01     2DSM02     4DS209     4DSD79     4DSG38     4DSG45     1DS204 
##          4          4          4          4          4          4          3 
##     1DS214     1DS500     1DSD08     1DSD15     1DSJ27     1DSM02     1DSM04 
##          3          3          3          3          3          3          3 
##    (Other)       NA's 
##        732      47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=NBHD_CD, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(NBHD_CD)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=NBHD_CD, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(LEGAL1)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$LEGAL1))
##                     LINCOLN MANOR                             IDEAL 
##                                66                                39 
##                   CROWDUS & AKARD                      SANTA MONICA 
##                                31                                31 
##                          FAIRVIEW        WEBSTER & WOODS NORTH PARK 
##                                26                                25 
##                          RAILROAD                 BRYAN PLACE REV 2 
##                                23                                22 
##             SMITH MURPHY & MARTIN                      ERVAY CEDARS 
##                                19                                17 
##                     OAKLAND PLACE                  UNIVERSITY PLACE 
##                                16                                15 
##                     VICKERY PLACE              COUNTRY CLUB ESTATES 
##                                14                                13 
##               SKYLINE HEIGHTS REV                    BEEMAN ESTATES 
##                                13                                12 
##                        CAMPS ALEX                      ELIZA MCCOYS 
##                                12                                12 
##            GASTON HOMESTEAD UNREC                        GASTONWOOD 
##                                12                                12 
##                    GRAVES HEIGHTS                   LINCOLN MANOR 2 
##                                12                                12 
##                  MOCKINGBIRD PARK                    COLUMBIA PLACE 
##                                12                                11 
##                             ELITE               LAKESHORE HILLS REV 
##                                11                                11 
##                        QUEEN CITY                     SPRING AVENUE 
##                                11                                11 
##                    CARUTH TERRACE                   EXPOSITION PARK 
##                                10                                10 
##                          PASADENA                    PEAKS SUBURBAN 
##                                10                                10 
##                          PIEDMONT                      WALNUT GROVE 
##                                10                                10 
##                          BROWDERS                       LAFOY PLACE 
##                                 9                                 9 
##                      MUNGER PLACE                   BEVERLY TERRACE 
##                                 9                                 8 
##              BROWDER'S PROVIDENCE                            ERWINS 
##                                 8                                 8 
##                             GOODS                    HIGHLAND HILLS 
##                                 8                                 8 
##                    JUNIUS HEIGHTS                        LOMAS PARK 
##                                 8                                 8 
##        MIDDLETON BROS CENTRAL AVE                             OZONE 
##                                 8                                 8 
##                         SNODGRASS                 TRADEWINDS CONDOS 
##                                 8                                 8 
##                        TUBERVILLE                                   
##                                 8                                 7 
##                  BELMONT SUBURBAN                        COCKRELL'S 
##                                 7                                 7 
##                      COLLEGE HILL            DALLAS LAND AND LOAN 2 
##                                 7                                 7 
##                    GASTON TERRACE                        H T LACKEY 
##                                 7                                 7 
##           ORIGINAL TOWN OF DALLAS                          ROBINSON 
##                                 7                                 7 
##        VERSAILLES PARKWAY REVISED                        ALTA VISTA 
##                                 7                                 6 
##                           BLK 217                       BLK 331 1/2 
##                                 6                                 6 
##                           BLK 801                  BURK & SLAUGHTER 
##                                 6                                 6 
##            C W GUILDS PARK AVENUE                  CHAPEL CREEK REV 
##                                 6                                 6 
##            DALLAS LAND AND LOAN 3                   DOLPHIN HEIGHTS 
##                                 6                                 6 
##                          E H RAYS                        EAST WE GO 
##                                 6                                 6 
##               EASTON CONDOMINIUMS                    GASTONWOOD REV 
##                                 6                                 6 
##  GREENLAND HILLS HILL TOP SECTION                     HASKELL PLACE 
##                                 6                                 6 
##    INDUSTRIAL IMPROVEMENT PROJECT                     J A JOHNSTONS 
##                                 6                                 6 
##                 J BOPPS HOMESTEAD                    J J EAKINS 2ND 
##                                 6                                 6 
##                    KESSLER SQUARE                            MCCOYS 
##                                 6                                 6 
##                        MONTICELLO                OAK CLIFF ORIGINAL 
##                                 6                                 6 
##                       T J RODGERS          1100 TRINITY MILLS CONDO 
##                                 6                                 5 
##                    BERGFELD PLACE BLANKENSHIP & BLAKE'S COTTON MILL 
##                                 5                                 5 
##                          BLK 2518                 BOB-O-LINKS DOWNS 
##                                 5                                 5 
##         BOWSER & LEMMONS OAK LAWN         DALLAS & WICHITA RAILROAD 
##                                 5                                 5 
##               EAST COLUMBIA PLACE                 EDGEFIELD HEIGHTS 
##                                 5                                 5 
##                    ELLINGTON PARK            GREENVILLE CREST ANNEX 
##                                 5                                 5 
##                      HERRLING F C                 KESSLER HIGHLANDS 
##                                 5                                 5 
##                   MONTEBELLO PARK RENAISSANCE ON TURTLE CREEK CONDO 
##                                 5                                 5 
##                           (Other)                              NA's 
##                              1666                             47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=LEGAL1, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(LEGAL1)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=LEGAL1, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(LMA)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$LMA))
##     1AS001     1ASVR1     1CA002     1DSC02     1DSCE7     1DSCE8     1DSCE9 
##          1          1          1          1          1          1         10 
##     1DSCEX     1DSD04     1DSDE1     1DSE05     1DSF05     1DSF06     1DSF4A 
##         15          1         91          1          1          1          1 
##     1DSG07     1DSG12     1DSG16     1DSH06     1DSH07     1DSH16     1DSH6B 
##          1          1          1          5          1          2          3 
##     1DSH9A     1DSH9B     1DSH9C     1DSI06     1DSI07     1DSI09     1DSI10 
##          3          2          1          1          1          4         29 
##     1DSI11     1DSI12     1DSI5A     1DSI7A     1DSJ06     1DSJ09     1DSJ10 
##          4          5          3          7          3         23         88 
##     1DSJ11     1DSJ6A     1DSJ7A     1DSJ7B     1DSJ7C     1DSJ7D     1DSJ7E 
##          8          8         37         27          4         10         20 
##     1DSJ7F     1DSJ7G     1DSK04     1DSK07     1DSK6A     1DSK7A     1DSOL1 
##         38         44          1         12          9          8          9 
##     1DSOL2     1DSOL3     1DSOL4     1DSOL5     1DSOL6     1DSOL7     1DSOL8 
##          3          2          2          5         20         18         16 
##     1ISA08     1ISB05     1ISB08     1ISD08     1OS001     1OS002     2DSK08 
##          3          1          1          1          1          1         10 
##     2DSK09     2DSK10     2DSK11     2DSK12     2DSK13     2DSK14     2DSK15 
##         47         20         10          1          9          4          3 
##     2DSK16     2DSK17     2DSK21     2DSK9A     2DSL06     2DSL08     2DSL09 
##         10          1          1         45          1          1          2 
##     2DSL10     2DSL11     2DSL4A     2DSL6A     2DSL6B     2DSL7A     2DSM05 
##          1          2          1          4          9          6          1 
##     2DSM08     2DSM12     2DSM6A     2DSN11     2DSP6A     2DSQ04     2DSR14 
##          1          1          1          1          1          1          2 
##     2PS007     3GSG03     3MSA02     3MSC02     3MSD03     3MSF01     3MSG02 
##          2          1          1          1          1          1          2 
## UNASSIGNED       NA's 
##       1829      47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=LMA, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(LMA)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=LMA, alpha=.5)) +
          geom_point()

THERE IS NOTHING TO SEE IN IMA

df_gg = df %>%
  filter(!is.na(IMA)) %>%
  filter(IMPR_VAL < 37794550)

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$IMA))
##     100100     100300     100800     110100     110400     111100     211800 
##          5          1          2         12          2          1          5 
##     212300     360100     3602OL     360500     360600     360800     360900 
##          1         15          2          2          4          3         15 
##     3609DE     540100     540200     540300     540800     541500     541700 
##         18         46         10         10          1          2          1 
##     660900     720100     720200     720400     720500     720800     720900 
##          2          3          3          1          5          2          7 
## UNASSIGNED       NA's 
##       2478      47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=IMA, alpha=.5)) +
          geom_point()

# see the same for points with <2k improvement value
df_gg = df %>%
  filter(!is.na(IMA)) %>%
  filter(IMPR_VAL < 1000)

ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=IMA, alpha=.5)) +
          geom_point()

df_gg = df %>%
  filter(!is.na(IMA)) %>%
  filter(IMPR_VAL < 1000) 

#  select(APPRAISAL_YR, DIVISION_CD, BIZ_NAME, OWNER_CITY, OWNER_STATE, OWNER_ZIPCODE, OWNER_COUNTRY, PROPERTY_CITY, PROPERTY_ZIPCODE, MAPSCO, NBHD_CD, LEGAL1, LEGAL2, DEED_TXFR_DATE, LMA, IMA)
summary(factor(df$IMA))
##     100100     100300     100800     110100     110400     111100     211800 
##          5          1          2         12          2          1          5 
##     212300     360100     3602OL     360500     360600     360800     360900 
##          1         15          2          2          4          3         15 
##     3609DE     540100     540200     540300     540800     541500     541700 
##         18         46         10         10          1          2          1 
##     660900     720100     720200     720400     720500     720800     720900 
##          2          3          3          1          5          2          7 
## UNASSIGNED       NA's 
##       2478      47341
ggplot(df_gg, aes(x=TOT_VAL, y = IMPR_VAL, color=IMA, alpha=.5)) +
          geom_point()

almost all the columns highly correlate with tot_val and are very right skewed

# Abatement exempt
# about properties for which abatements have been granted
# head(abatement_exempt)
# # df = abatement_exempt
# columns_list = list(apprl_yr = abatement_exempt$APPRAISAL_YR, 
#                     tot_val = abatement_exempt$TOT_VAL, 
#                     city_eff_yr = abatement_exempt$CITY_EFF_YR, 
#                     city_val_dif = abatement_exempt$CITY_VAL_DIF, 
#                     cnty_val_dif = abatement_exempt$CNTY_VAL_DIF, 
#                     isd_val_dif = abatement_exempt$ISD_VAL_DIF, 
#                     coll_val_dif = abatement_exempt$COLL_VAL_DIF, 
#                     spec_exemption_pct = abatement_exempt$SPEC_EXEMPTION_PCT, 
#                     spec_val_dif = abatement_exempt$SPEC_VAL_DIF, 
#                     spec_exemption_amt = abatement_exempt$SPEC_EXEMPTION_AMT)
# i = 1
# for(column in columns_list){
#   name = names(columns_list)[i]
#   plot(density(x=column), main = paste("Density Plot of", name))
#   
#   plot(x=abatement_exempt$TOT_VAL, y = column, main = paste("Scatterplot of tot_val vs", name))
#   i = i+1
# }
# 
#Account_apprl_year
# primary or central table in this dataset, it contains summary information about all appraisal accounts.


#head(account_apprl_year)
#df = account_apprl_year

# columns_list = list(impr_val = df$IMPR_VAL,
#   land_val = df$LAND_VAL,
#   tot_val = df$TOT_VAL,
#   hmstd_cap_val = df$HMSTD_CAP_VAL,
#   prev_reval_yr = df$PREV_REVAL_YR,
#   prev_mkt_val = df$PREV_MKT_VAL,
#   city_taxable_val = df$CITY_TAXABLE_VAL,
#   county_taxable_val = df$COUNTY_TAXABLE_VAL,
#   isd_taxable_val = df$ISD_TAXABLE_VAL,
#   hospital_taxable_val = df$HOSPITAL_TAXABLE_VAL,
#   college_taxable_val = df$COLLEGE_TAXABLE_VAL,
#   county_ceiling_value = df$COUNTY_CEILING_VALUE,
#   isd_ceiling_value = df$ISD_CEILING_VALUE,
#   appraisal_meth_cd = df$APPRAISAL_METH_CD
#   )
# 
# i = 1
# for(column in columns_list){
#   name = names(columns_list)[i]
#   gg_df = data.frame(x = column, tot_val = df$TOT_VAL)
#   #print(head(gg_df))
#   print(ggplot(gg_df, aes(x=x, y=tot_val)) + 
#            geom_point() +
#           ggtitle(paste("Scatterplot of", name)))
#   
#   #plot(density(x=column), main = paste("Density Plot of", name))
#   
#   #plot(x=df$TOT_VAL, y = column, main = paste("Scatterplot of tot_val vs", name))
#   i = i+1
# }
summary(account_info)
##  ACCOUNT_NUM         APPRAISAL_YR  DIVISION_CD          BIZ_NAME        
##  Length:50000       Min.   :2020   Length:50000       Length:50000      
##  Class :character   1st Qu.:2020   Class :character   Class :character  
##  Mode  :character   Median :2020   Mode  :character   Mode  :character  
##                     Mean   :2020                                        
##                     3rd Qu.:2020                                        
##                     Max.   :2020                                        
##                                                                         
##  OWNER_NAME1        OWNER_NAME2    EXCLUDE_OWNER      OWNER_ADDRESS_LINE1
##  Length:50000       Mode:logical   Length:50000       Length:50000       
##  Class :character   NA's:50000     Class :character   Class :character   
##  Mode  :character                  Mode  :character   Mode  :character   
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  OWNER_ADDRESS_LINE2 OWNER_ADDRESS_LINE3 OWNER_ADDRESS_LINE4  OWNER_CITY       
##  Length:50000        Mode:logical        Mode:logical        Length:50000      
##  Class :character    NA's:50000          NA's:50000          Class :character  
##  Mode  :character                                            Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##  OWNER_STATE        OWNER_ZIPCODE       OWNER_COUNTRY        STREET_NUM    
##  Length:50000       Min.   :    75050   Length:50000       Min.   :     1  
##  Class :character   1st Qu.:751343039   Class :character   1st Qu.:  1412  
##  Mode  :character   Median :752122132   Mode  :character   Median :  2909  
##                     Mean   :751968379                      Mean   :  3616  
##                     3rd Qu.:752232123                      3rd Qu.:  4859  
##                     Max.   :997016578                      Max.   :900009  
##                     NA's   :66                                             
##  STREET_HALF_NUM FULL_STREET_NAME     BLDG_ID            UNIT_ID         
##  Mode:logical    Length:50000       Length:50000       Length:50000      
##  NA's:50000      Class :character   Class :character   Class :character  
##                  Mode  :character   Mode  :character   Mode  :character  
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  PROPERTY_CITY      PROPERTY_ZIPCODE       MAPSCO            NBHD_CD         
##  Length:50000       Min.   :    75001   Length:50000       Length:50000      
##  Class :character   1st Qu.:751155736   Class :character   Class :character  
##  Mode  :character   Median :752116421   Mode  :character   Mode  :character  
##                     Mean   :706756744                                        
##                     3rd Qu.:752194206                                        
##                     Max.   :760512709                                        
##                     NA's   :3                                                
##     LEGAL1             LEGAL2             LEGAL3             LEGAL4         
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     LEGAL5          DEED_TXFR_DATE     GIS_PARCEL_ID      PHONE_NUM     
##  Length:50000       Length:50000       Length:50000       Mode:logical  
##  Class :character   Class :character   Class :character   NA's:50000    
##  Mode  :character   Mode  :character   Mode  :character                 
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##      LMA                IMA           
##  Length:50000       Length:50000      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
# Loop

# i = 1
# for(column in names(df)){
#   print(column)
#   name = column
#   print(length(df[[column]]))
#   print(length(df$OWNER_CITY))
#   gg_df = data.frame(x = df[[column]], OWNER_CITY = df$OWNER_CITY)
# 
#   print(head(gg_df))
#   print(ggplot(gg_df, aes(x=x, y = ..count.., fill=x)) +
#           geom_bar())
#   #print(ggplot(gg_df, aes(x=x, y= ..count..)) +
#   #  geom_bar(aes(fill = OWNER_CITY), position = "dodge") + 
#   #  ggtitle(paste("Scatterplot of", name)))
#   
#   #plot(density(x=column), main = paste("Density Plot of", name))
#   
#   #plot(x=df$TOT_VAL, y = column, main = paste("Scatterplot of tot_val vs", name))
#   i = i+1
# }
# account_tif
# contains information about appraisal accounts in Tax Increment Finance districts.
# acct_exempt_value
# contains the value of each exemption granted
# applied_std_exempt
# contains information about standard exemptions granted.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.